
# Path to the tab-delimited North Carolina demographic extract.
f <- "../../voter_file_data/national/post_2020/VM2--NC--2021-01-28/VM2--NC--2021-01-28-DEMOGRAPHIC.tab"

# Read only the columns used further down: voter ID, reported ethnicity,
# birth date / age, coordinates, gender, party, registration date,
# congressional district, FIPS, names, and the residence address parts.
k <- fread(
  f,
  sep = "\t",
  select = c(
    "LALVOTERID",
    "CountyEthnic_Description",
    "Voters_BirthDate",
    "Residence_Addresses_Latitude",
    "Residence_Addresses_Longitude",
    "Voters_Gender",
    "Voters_Age",
    "Parties_Description",
    "Voters_OfficialRegDate",
    "US_Congressional_District",
    "Voters_FIPS",
    "Voters_LastName",
    "Voters_FirstName",
    "Residence_Addresses_City",
    "Residence_Addresses_Zip",
    "Residence_Addresses_HouseNumber",
    "Residence_Addresses_PrefixDirection",
    "Residence_Addresses_StreetName",
    "Residence_Addresses_Designator",
    "Residence_Addresses_SuffixDirection",
    "Residence_Addresses_ApartmentNum"
  )
)

# Combine the address components into one street string.
# NOTE(review): `clean_streets()` is defined outside this file; the code below
# relies on it returning `k` with a `street` column -- confirm.
k <- clean_streets(
  k,
  c(
    "Residence_Addresses_HouseNumber",
    "Residence_Addresses_PrefixDirection",
    "Residence_Addresses_StreetName",
    "Residence_Addresses_Designator",
    "Residence_Addresses_SuffixDirection"
  )
)

k <- mutate(
  k,
  # Append the apartment number only when one is present (neither NA nor
  # blank after trimming); otherwise leave the street untouched.
  street = ifelse(
    !is.na(Residence_Addresses_ApartmentNum) &
      trimws(Residence_Addresses_ApartmentNum) != "",
    paste(street, Residence_Addresses_ApartmentNum),
    street
  ),
  # Collapse every run of punctuation/spaces into a single space, then
  # upper-case and trim the result.
  street = trimws(toupper(gsub('[[:punct:] ]+', ' ', street)))
)

# Load the search -> replace lookup used to standardise street strings.
# Rows whose search and replace values are already identical are dropped,
# both columns are upper-cased, and duplicate rows are removed.
ad_cl <- fread("raw_data/address_cleaner.csv") |>
  filter(search != replace) |>
  mutate(across(c(search, replace), toupper)) |>
  distinct()

# Apply each replacement in turn. A `search` term is only matched when it is
# bounded by whitespace or the start/end of the string; the captured
# boundaries (\\1 and \\2) are written back around the replacement.
# seq_len() makes the loop a no-op when the lookup has zero rows, whereas
# 1:nrow() would iterate over c(1, 0) and run gsub() with bogus patterns.
for (i in seq_len(nrow(ad_cl))) {
  print(i) # progress indicator
  k$street <- gsub(
    paste0("(^|\\s)", ad_cl$search[i], "(\\s|$)"),
    paste0("\\1", ad_cl$replace[i], "\\2"),
    k$street
  )
}

# Normalise first and last names: convert from WINDOWS-1252 to UTF-8,
# lower-case, turn empty strings into NA, then strip punctuation and spaces.
k <- mutate(
  k,
  across(
    c("Voters_LastName", "Voters_FirstName"),
    \(x) {
      lowered <- tolower(iconv(x, "WINDOWS-1252", "UTF-8"))
      gsub("[[:punct:]]| ", "", ifelse(x == "", NA, lowered))
    }
  )
)

# Copy of the table at this stage (not referenced again in the visible code).
hold <- k
#####################
# Drop records that have no longitude.
k <- k |>
  filter(!is.na(Residence_Addresses_Longitude))

# 2020 census block polygons for North Carolina, requested as an sp object.
blocks <- tigris::blocks("NC", year = 2020, class = "sp")

k <- k |>
  rename(surname = Voters_LastName)

# Records whose reported ethnicity is neither blank nor
# "Other Undefined Race": the pred.* flags are set directly from that value.
okay <- k |>
  filter(!(CountyEthnic_Description %in% c("", "Other Undefined Race"))) |>
  mutate(
    pred.whi = CountyEthnic_Description == "White Self Reported",
    pred.bla = CountyEthnic_Description == "African or Af-Am Self Reported",
    pred.his = CountyEthnic_Description == "Hispanic",
    pred.asi = CountyEthnic_Description == "East Asian"
  )

# The remaining records (blank / undefined ethnicity) stay in `k` and get
# their pred.* values from the surname + geography model further down.
k <- k |>
  filter(CountyEthnic_Description %in% c("", "Other Undefined Race"))

# Locate each remaining voter inside a 2020 census block.
# NOTE(review): the points are simply labelled with the block layer's CRS (no
# transformation), so the coordinates are assumed to already be in that CRS
# -- confirm.
pings <- SpatialPoints(
  k[, c("Residence_Addresses_Longitude", "Residence_Addresses_Latitude")],
  proj4string = blocks@proj4string
)

# Read the block ID by its exact column name. The filter below already uses
# `blocks$GEOID20`; `over(...)$GEOID` only worked through `$` partial matching
# on the returned data frame, which silently yields NULL as soon as more than
# one column starts with "GEOID".
k$GEOID <- over(pings, blocks)$GEOID20

# Split the block GEOID into its parts: county (chars 3-5), tract (6-11),
# block (last 4).
k$block <- str_sub(k$GEOID, start = -4)
k$tract <- str_sub(k$GEOID, 6, 11)
k$county <- str_sub(k$GEOID, 3, 5)

# Keep only voters whose rebuilt ID ("37" + county + tract + block) is a known
# block GEOID; points that fell inside no block have an NA GEOID and are
# dropped here.
k <- filter(
  k,
  paste0("37", county, tract, block) %in% blocks$GEOID20
)

# Slim copy of the identifying columns, tagged with the state abbreviation;
# the model output is column-bound onto it later.
k2 <- k |>
  select(LALVOTERID, surname, county, tract) |>
  mutate(state = "NC")

# Block-level 2020 decennial counts in wide form, one column per group.
# `other` is whatever is left of the total after the five named groups; the
# total and the NAME column are then dropped.
pop <- get_decennial(
  geography = "block",
  variables = c(
    hisp = "P2_002N",
    white = "P2_005N",
    black = "P2_006N",
    asian = "P2_008N",
    aian = "P2_007N",
    total = "P2_001N"
  ),
  state = "NC",
  year = 2020,
  output = "wide"
) |>
  mutate(other = total - white - black - hisp - asian - aian) |>
  select(-total, -NAME)


# State-level 2020 decennial counts for the same groups, converted to shares
# of the state total and reshaped to one row per group (`name`, `value`).
state_pop <- get_decennial(
  geography = "state",
  variables = c(
    hisp = "P2_002N",
    white = "P2_005N",
    black = "P2_006N",
    asian = "P2_008N",
    aian = "P2_007N",
    total = "P2_001N"
  ),
  state = "NC",
  year = 2020,
  output = "wide"
) |>
  mutate(
    other = total - white - black - hisp - asian - aian,
    across(c(other, white, black, hisp, asian, aian), ~ . / total)
  ) |>
  select(-total, -NAME, -GEOID) |>
  pivot_longer(cols = c(other, white, black, hisp, asian, aian))

# Named numeric vector of the state-wide shares, keyed by group name.
state_pop_v <- setNames(state_pop$value, state_pop$name)

# Estimate group probabilities from surname and block GEOID, using the
# state-wide shares as `p_r` and the block-level counts as `p_rgx`.
# NOTE(review): `bisg()` and `nm()` are not defined in this file -- presumably
# they come from the birdie package; confirm.
out_b <- bisg(
  ~ nm(surname) + GEOID,
  data = k,
  p_r = state_pop_v,
  p_rgx = pop
)
#################################

# Attach the model output to the ID columns (row order is assumed to match
# `k`, from which both were built) and rename four of the probability columns
# to the pred.* names used for the self-reported records.
k2 <- cbind(k2, out_b) |>
  rename(
    pred.whi = pr_white,
    pred.bla = pr_black,
    pred.his = pr_hisp,
    pred.asi = pr_asian
  )

# Attach the modelled pred.* columns to the voter records, dropping the
# temporary block-level geography columns first.
# The join key is stated explicitly: LALVOTERID is the only column the two
# inputs share, and naming it avoids relying on dplyr's implicit natural join
# (and its "Joining with `by = ...`" message).
k <- left_join(
  select(k, -county, -tract, -block, -GEOID),
  select(k2, LALVOTERID, starts_with("pred")),
  by = "LALVOTERID"
)

rm(k2)

# Stack the records with a self-reported ethnicity (`okay`) on top of the
# records whose pred.* values came from the model.
k <- bind_rows(okay, k)

# 2020 census block-group polygons for North Carolina, as an sp object.
block_groups <- tigris::block_groups("NC", year = 2020, class = "sp")

# Point-in-polygon lookup of every voter against the block groups.
pings <- SpatialPoints(
  k[, c("Residence_Addresses_Longitude", "Residence_Addresses_Latitude")],
  proj4string = block_groups@proj4string
)

k$GEOID <- over(pings, block_groups)$GEOID

# Keep (and rename) the analysis columns.
k <- k |>
  select(
    LALVOTERID,
    lat = Residence_Addresses_Latitude,
    lon = Residence_Addresses_Longitude,
    gender = Voters_Gender,
    voter_age = Voters_Age,
    street,
    party = Parties_Description,
    Residence_Addresses_City,
    reg_date = Voters_OfficialRegDate,
    cong = US_Congressional_District,
    Voters_BirthDate,
    starts_with("pred."),
    GEOID,
    surname
  )

# 0/1 indicators for gender and the two major parties.
k <- k |>
  mutate(
    male = as.numeric(gender == "M"),
    dem = as.numeric(party == "Democratic"),
    rep = as.numeric(party == "Republican")
  )

# Read the general-election participation and ballot-type columns for
# 2012-2020 from the vote-history file. Each General_* column is recoded to
# 1 when it equals "Y" and 0 for any other non-missing value; the
# BallotType_* columns are left as read.
hist <- fread(
  "../../voter_file_data/national/post_2020/VM2--NC--2021-01-28/VM2--NC--2021-01-28-VOTEHISTORY.tab",
  select = c(
    "LALVOTERID",
    "General_2012_11_06",
    "General_2014_11_04",
    "General_2016_11_08",
    "General_2018_11_06",
    "General_2020_11_03",
    "BallotType_General_2012_11_06",
    "BallotType_General_2014_11_04",
    "BallotType_General_2016_11_08",
    "BallotType_General_2018_11_06",
    "BallotType_General_2020_11_03"
  )
) |>
  mutate(across(starts_with("Gene"), \(x) ifelse(x == "Y", 1, 0)))

# Join on the voter ID explicitly: LALVOTERID is the only column shared by the
# two tables, and naming it avoids depending on the implicit natural join.
k <- left_join(k, hist, by = "LALVOTERID")

# Load the census helper script.
# NOTE(review): `get_basic_census_stats()` is presumably defined there --
# confirm.
source("code/99_helpers/census_script.R")

# Block-group level census covariates for North Carolina, 2020.
# NOTE(review): the name `c` shadows base::c when used as a value; it is kept
# so that any later code reading `c` is unaffected.
c <- get_basic_census_stats("block group", 2020, "NC") |>
  select(GEOID, median_income, some_college, pop_dens)

# Join on the block-group GEOID explicitly: it is the only column shared by
# the two tables, and naming it avoids depending on the implicit natural join.
k <- left_join(k, c, by = "GEOID")

# Persist the cleaned voter file.
saveRDS(k, "temp/cleaned_file_nc.rds")

